{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "D2H6YLCPOXvZ",
        "outputId": "1a61101a-276b-4593-ee51-ac428a452015"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
            "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (1.21.6)\n",
            "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
            "Collecting transformers\n",
            "  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)\n",
            "\u001b[K     |████████████████████████████████| 5.3 MB 4.8 MB/s \n",
            "\u001b[?25hRequirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.7/dist-packages (from transformers) (6.0)\n",
            "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers) (4.13.0)\n",
            "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (2022.6.2)\n",
            "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from transformers) (21.3)\n",
            "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers) (3.8.0)\n",
            "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers) (4.64.1)\n",
            "Collecting tokenizers!=0.11.3,<0.14,>=0.11.1\n",
            "  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)\n",
            "\u001b[K     |████████████████████████████████| 7.6 MB 35.4 MB/s \n",
            "\u001b[?25hRequirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers) (2.23.0)\n",
            "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (1.21.6)\n",
            "Collecting huggingface-hub<1.0,>=0.10.0\n",
            "  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)\n",
            "\u001b[K     |████████████████████████████████| 163 kB 61.4 MB/s \n",
            "\u001b[?25hRequirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0,>=0.10.0->transformers) (4.1.1)\n",
            "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.0->transformers) (3.0.9)\n",
            "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers) (3.9.0)\n",
            "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (3.0.4)\n",
            "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (1.24.3)\n",
            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2022.9.24)\n",
            "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2.10)\n",
            "Installing collected packages: tokenizers, huggingface-hub, transformers\n",
            "Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1\n",
            "Downloading...\n",
            "From: https://drive.google.com/uc?id=18oZZ4jqRK-uF-Nz6ftRdgNjKix88hrnO\n",
            "To: /content/data_and_models.zip\n",
            "100% 33.3M/33.3M [00:00<00:00, 140MB/s] \n",
            "Archive:  data_and_models.zip\n",
            "   creating: data_and_models/\n",
            "  inflating: __MACOSX/._data_and_models  \n",
            "  inflating: data_and_models/logistic_model_8.pkl  \n",
            "  inflating: __MACOSX/data_and_models/._logistic_model_8.pkl  \n",
            "  inflating: data_and_models/tfidf_44.pkl  \n",
            "  inflating: __MACOSX/data_and_models/._tfidf_44.pkl  \n",
            "  inflating: data_and_models/tfidf_8.pkl  \n",
            "  inflating: __MACOSX/data_and_models/._tfidf_8.pkl  \n",
            "  inflating: data_and_models/target_corpus.csv  \n",
            "  inflating: __MACOSX/data_and_models/._target_corpus.csv  \n",
            "  inflating: data_and_models/logistic_model_44.pkl  \n",
            "  inflating: __MACOSX/data_and_models/._logistic_model_44.pkl  \n"
          ]
        }
      ],
      "source": [
        "!pip install transformers\n",
        "!pip install --upgrade --no-cache-dir gdown==4.5.4\n",
        "\n",
        "!gdown 18oZZ4jqRK-uF-Nz6ftRdgNjKix88hrnO\n",
        "!unzip data_and_models.zip && rm data_and_models.zip"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "-y9uB6E4gtIa",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "27a35932-e3c3-4224-b357-1dc2299a7d3b"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "Token indices sequence length is longer than the specified maximum sequence length for this model (1357 > 512). Running this sequence through the model will result in indexing errors\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "####################\n",
            "Summary statisticis\n",
            "Number of samples: 4165\n",
            "Mean: 213\n",
            "Max 4823\n",
            "Min 10\n",
            "Standard deviation 358\n",
            "Number of samples with sequence length > 512: 533\n",
            "####################\n"
          ]
        }
      ],
      "source": [
        "import csv\n",
        "\n",
        "import numpy as np\n",
        "from transformers import RobertaTokenizerFast\n",
        "\n",
        "texts = []\n",
        "with open(\"data_and_models/target_corpus.csv\") as doc:\n",
        "  reader = csv.reader(doc)\n",
        "  next(reader)\n",
        "  for row in reader:\n",
        "    texts.append(row[0])\n",
        "\n",
        "tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')\n",
        "encodings = tokenizer(texts, truncation=False, padding=False)\n",
        "\n",
        "sequence_lengths = [len(encoding) for encoding in encodings[\"input_ids\"]]\n",
        "print(\"#\" * 20)\n",
        "print(\"Summary statisticis\")\n",
        "print(\"Number of samples:\", len(sequence_lengths))\n",
        "print(\"Mean:\", round(np.mean(sequence_lengths)))\n",
        "print(\"Max\", np.max(sequence_lengths))\n",
        "print(\"Min\", np.min(sequence_lengths))\n",
        "print(\"Standard deviation\", round(np.std(sequence_lengths)))\n",
        "print(\"Number of samples with sequence length > 512:\", sum([l > 512 for l in sequence_lengths]))\n",
        "print(\"#\" * 20)"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": []
    },
    "gpuClass": "standard",
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}